{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Here I will to show how to use linear model stochastic gradient descent on multi-class classification/discrimination\n",
    "\n",
    "import class sklearn.linear_model.SGDClassifier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.5/dist-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n",
      "  \"This module will be removed in 0.20.\", DeprecationWarning)\n"
     ]
    }
   ],
   "source": [
    "from sklearn import metrics\n",
    "import numpy as np\n",
    "import sklearn.datasets\n",
    "import re\n",
    "from sklearn.feature_extraction.text import HashingVectorizer\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "from sklearn.feature_extraction.text import TfidfTransformer\n",
    "from sklearn.linear_model import SGDClassifier\n",
    "from sklearn.cross_validation import train_test_split"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Define some functions to help us on preprocessing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# clear string\n",
    "def clearstring(string):\n",
    "    string = re.sub('[^\\\"\\'A-Za-z0-9 ]+', '', string)\n",
    "    string = string.split(' ')\n",
    "    string = filter(None, string)\n",
    "    string = [y.strip() for y in string]\n",
    "    string = ' '.join(string)\n",
    "    return string\n",
    "\n",
    "# because of sklean.datasets read a document as a single element\n",
    "# so we want to split based on new line\n",
    "def separate_dataset(trainset):\n",
    "    datastring = []\n",
    "    datatarget = []\n",
    "    for i in range(len(trainset.data)):\n",
    "        data_ = trainset.data[i].split('\\n')\n",
    "        # python3, if python2, just remove list()\n",
    "        data_ = list(filter(None, data_))\n",
    "        for n in range(len(data_)):\n",
    "            data_[n] = clearstring(data_[n])\n",
    "        datastring += data_\n",
    "        for n in range(len(data_)):\n",
    "            datatarget.append(trainset.target[i])\n",
    "    return datastring, datatarget"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['anger', 'fear', 'joy', 'love', 'sadness', 'surprise']\n",
      "416809\n",
      "416809\n"
     ]
    }
   ],
   "source": [
    "# you can change any encoding type\n",
    "trainset = sklearn.datasets.load_files(container_path = 'data', encoding = 'UTF-8')\n",
    "trainset.data, trainset.target = separate_dataset(trainset)\n",
    "print (trainset.target_names)\n",
    "print (len(trainset.data))\n",
    "print (len(trainset.target))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.5/dist-packages/sklearn/feature_extraction/hashing.py:94: DeprecationWarning: the option non_negative=True has been deprecated in 0.19 and will be removed in version 0.21.\n",
      "  \" in version 0.21.\", DeprecationWarning)\n",
      "/usr/local/lib/python3.5/dist-packages/sklearn/feature_extraction/hashing.py:94: DeprecationWarning: the option non_negative=True has been deprecated in 0.19 and will be removed in version 0.21.\n",
      "  \" in version 0.21.\", DeprecationWarning)\n"
     ]
    }
   ],
   "source": [
    "# bag-of-word\n",
    "bow = CountVectorizer().fit_transform(trainset.data)\n",
    "\n",
    "#tf-idf, must get from BOW first\n",
    "tfidf = TfidfTransformer().fit_transform(bow)\n",
    "\n",
    "#hashing, default n_features, probability cannot divide by negative\n",
    "hashing = HashingVectorizer(non_negative = True).fit_transform(trainset.data)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### loss function got {'modified_huber', 'hinge', 'log', 'squared_hinge', 'perceptron'}\n",
    "\n",
    "default is hinge, will give you classic SVM\n",
    "\n",
    "perceptron in linear loss\n",
    "\n",
    "huber and log both logistic classifier\n",
    "\n",
    "#### penalty got {'l1', 'l2'}, to prevent overfitting\n",
    "\n",
    "l1 = MAE (mean absolute error)\n",
    "\n",
    "l2 = RMSE (root mean square error)\n",
    "\n",
    "#### alpha is learning rate\n",
    "\n",
    "#### n_iter is number of epoch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.5/dist-packages/sklearn/linear_model/stochastic_gradient.py:73: DeprecationWarning: n_iter parameter is deprecated in 0.19 and will be removed in 0.21. Use max_iter and tol instead.\n",
      "  DeprecationWarning)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "accuracy validation set:  0.898586886111\n",
      "             precision    recall  f1-score   support\n",
      "\n",
      "      anger       0.91      0.88      0.90     11422\n",
      "       fear       0.84      0.87      0.86      9495\n",
      "        joy       0.90      0.94      0.92     28138\n",
      "       love       0.84      0.74      0.79      6970\n",
      "    sadness       0.93      0.94      0.94     24380\n",
      "   surprise       0.85      0.65      0.73      2957\n",
      "\n",
      "avg / total       0.90      0.90      0.90     83362\n",
      "\n"
     ]
    }
   ],
   "source": [
    "train_X, test_X, train_Y, test_Y = train_test_split(bow, trainset.target, test_size = 0.2)\n",
    "\n",
    "mod_huber = SGDClassifier(loss = 'modified_huber', \n",
    "                                  penalty = 'l2', alpha = 1e-3, \n",
    "                                  n_iter = 10).fit(train_X, train_Y)\n",
    "predicted = mod_huber.predict(test_X)\n",
    "print('accuracy validation set: ', np.mean(predicted == test_Y))\n",
    "\n",
    "# print scores\n",
    "print(metrics.classification_report(test_Y, predicted, target_names = trainset.target_names))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.5/dist-packages/sklearn/linear_model/stochastic_gradient.py:73: DeprecationWarning: n_iter parameter is deprecated in 0.19 and will be removed in 0.21. Use max_iter and tol instead.\n",
      "  DeprecationWarning)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "accuracy validation set:  0.850915285142\n",
      "             precision    recall  f1-score   support\n",
      "\n",
      "      anger       0.93      0.75      0.83     11542\n",
      "       fear       0.88      0.73      0.79      9610\n",
      "        joy       0.79      0.97      0.87     28110\n",
      "       love       0.92      0.55      0.69      6883\n",
      "    sadness       0.88      0.94      0.91     24230\n",
      "   surprise       0.91      0.46      0.61      2987\n",
      "\n",
      "avg / total       0.86      0.85      0.84     83362\n",
      "\n"
     ]
    }
   ],
   "source": [
    "train_X, test_X, train_Y, test_Y = train_test_split(tfidf, trainset.target, test_size = 0.2)\n",
    "\n",
    "mod_huber = SGDClassifier(loss = 'modified_huber', \n",
    "                                  penalty = 'l2', alpha = 1e-3, \n",
    "                                  n_iter = 10).fit(train_X, train_Y)\n",
    "predicted = mod_huber.predict(test_X)\n",
    "print('accuracy validation set: ', np.mean(predicted == test_Y))\n",
    "\n",
    "# print scores\n",
    "print(metrics.classification_report(test_Y, predicted, target_names = trainset.target_names))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.5/dist-packages/sklearn/linear_model/stochastic_gradient.py:73: DeprecationWarning: n_iter parameter is deprecated in 0.19 and will be removed in 0.21. Use max_iter and tol instead.\n",
      "  DeprecationWarning)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "accuracy validation set:  0.791163839639\n",
      "             precision    recall  f1-score   support\n",
      "\n",
      "      anger       0.92      0.64      0.76     11592\n",
      "       fear       0.87      0.59      0.70      9557\n",
      "        joy       0.71      0.97      0.82     28068\n",
      "       love       0.94      0.40      0.56      6933\n",
      "    sadness       0.83      0.90      0.87     24273\n",
      "   surprise       0.91      0.34      0.49      2939\n",
      "\n",
      "avg / total       0.82      0.79      0.78     83362\n",
      "\n"
     ]
    }
   ],
   "source": [
    "train_X, test_X, train_Y, test_Y = train_test_split(hashing, trainset.target, test_size = 0.2)\n",
    "\n",
    "mod_huber = SGDClassifier(loss = 'modified_huber', \n",
    "                                  penalty = 'l2', alpha = 1e-3, \n",
    "                                  n_iter = 10).fit(train_X, train_Y)\n",
    "predicted = mod_huber.predict(test_X)\n",
    "print('accuracy validation set: ', np.mean(predicted == test_Y))\n",
    "\n",
    "# print scores\n",
    "print(metrics.classification_report(test_Y, predicted, target_names = trainset.target_names))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Always BOW got the highest accuracy among other vectorization"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now let we use linear model to do classifers, I will use BOW as vectorizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.5/dist-packages/sklearn/linear_model/stochastic_gradient.py:73: DeprecationWarning: n_iter parameter is deprecated in 0.19 and will be removed in 0.21. Use max_iter and tol instead.\n",
      "  DeprecationWarning)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "accuracy validation set:  0.896079748566\n",
      "             precision    recall  f1-score   support\n",
      "\n",
      "      anger       0.91      0.88      0.89     11440\n",
      "       fear       0.86      0.85      0.86      9525\n",
      "        joy       0.89      0.95      0.92     28411\n",
      "       love       0.89      0.69      0.77      6839\n",
      "    sadness       0.92      0.95      0.93     24074\n",
      "   surprise       0.89      0.63      0.74      3073\n",
      "\n",
      "avg / total       0.90      0.90      0.89     83362\n",
      "\n"
     ]
    }
   ],
   "source": [
    "train_X, test_X, train_Y, test_Y = train_test_split(bow, trainset.target, test_size = 0.2)\n",
    "\n",
    "svm = SGDClassifier(penalty = 'l2', alpha = 1e-3, n_iter = 10).fit(train_X, train_Y)\n",
    "predicted = svm.predict(test_X)\n",
    "print('accuracy validation set: ', np.mean(predicted == test_Y))\n",
    "\n",
    "# print scores\n",
    "print(metrics.classification_report(test_Y, predicted, target_names = trainset.target_names))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.5/dist-packages/sklearn/linear_model/stochastic_gradient.py:73: DeprecationWarning: n_iter parameter is deprecated in 0.19 and will be removed in 0.21. Use max_iter and tol instead.\n",
      "  DeprecationWarning)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "accuracy validation set:  0.787709028094\n",
      "             precision    recall  f1-score   support\n",
      "\n",
      "      anger       0.90      0.75      0.82     11364\n",
      "       fear       0.83      0.49      0.62      9536\n",
      "        joy       0.68      0.95      0.79     28475\n",
      "       love       0.74      0.29      0.41      6945\n",
      "    sadness       0.91      0.93      0.92     24034\n",
      "   surprise       0.83      0.30      0.45      3008\n",
      "\n",
      "avg / total       0.80      0.79      0.77     83362\n",
      "\n"
     ]
    }
   ],
   "source": [
    "train_X, test_X, train_Y, test_Y = train_test_split(bow, trainset.target, test_size = 0.2)\n",
    "\n",
    "sq_hinge = SGDClassifier(loss = 'squared_hinge', \n",
    "                                  penalty = 'l2', alpha = 1e-3, \n",
    "                                  n_iter = 10).fit(train_X, train_Y)\n",
    "predicted = sq_hinge.predict(test_X)\n",
    "print('accuracy validation set: ', np.mean(predicted == test_Y))\n",
    "\n",
    "# print scores\n",
    "print(metrics.classification_report(test_Y, predicted, target_names = trainset.target_names))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.5/dist-packages/sklearn/linear_model/stochastic_gradient.py:73: DeprecationWarning: n_iter parameter is deprecated in 0.19 and will be removed in 0.21. Use max_iter and tol instead.\n",
      "  DeprecationWarning)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "accuracy validation set:  0.889506009933\n",
      "             precision    recall  f1-score   support\n",
      "\n",
      "      anger       0.90      0.89      0.90     11551\n",
      "       fear       0.85      0.83      0.84      9358\n",
      "        joy       0.90      0.92      0.91     28254\n",
      "       love       0.79      0.75      0.77      7003\n",
      "    sadness       0.93      0.93      0.93     24162\n",
      "   surprise       0.71      0.77      0.74      3034\n",
      "\n",
      "avg / total       0.89      0.89      0.89     83362\n",
      "\n"
     ]
    }
   ],
   "source": [
    "train_X, test_X, train_Y, test_Y = train_test_split(bow, trainset.target, test_size = 0.2)\n",
    "\n",
    "perceptron = SGDClassifier(loss = 'perceptron', \n",
    "                                  penalty = 'l2', alpha = 1e-3, \n",
    "                                  n_iter = 10).fit(train_X, train_Y)\n",
    "predicted = perceptron.predict(test_X)\n",
    "print('accuracy validation set: ', np.mean(predicted == test_Y))\n",
    "\n",
    "# print scores\n",
    "print(metrics.classification_report(test_Y, predicted, target_names = trainset.target_names))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "But how to get probability of our output?\n",
    "\n",
    "Only applicable if your loss = {'log', 'modified_huber'} because both are logistic regression"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.5/dist-packages/sklearn/linear_model/stochastic_gradient.py:73: DeprecationWarning: n_iter parameter is deprecated in 0.19 and will be removed in 0.21. Use max_iter and tol instead.\n",
      "  DeprecationWarning)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "accuracy validation set:  0.896751517478\n",
      "             precision    recall  f1-score   support\n",
      "\n",
      "      anger       0.90      0.89      0.89     11368\n",
      "       fear       0.85      0.85      0.85      9440\n",
      "        joy       0.90      0.95      0.92     28329\n",
      "       love       0.86      0.73      0.79      6977\n",
      "    sadness       0.93      0.94      0.94     24278\n",
      "   surprise       0.78      0.69      0.73      2970\n",
      "\n",
      "avg / total       0.90      0.90      0.90     83362\n",
      "\n",
      "['i m already feeling somewhat strange given that i get very good and while i can not open my eyes', 'i myself smiling through loving simple dialog child logic explain situation feelings it s funny']\n",
      "[5, 5]\n",
      "[[ 0.          0.4859605   0.10990839  0.          0.          0.40413111]\n",
      " [ 0.          0.          0.          0.45115266  0.          0.54884734]]\n"
     ]
    }
   ],
   "source": [
    "train_X, test_X, train_Y, test_Y = train_test_split(bow, trainset.target, test_size = 0.2)\n",
    "\n",
    "mod_huber = SGDClassifier(loss = 'modified_huber', \n",
    "                                  penalty = 'l2', alpha = 1e-3, \n",
    "                                  n_iter = 10).fit(train_X, train_Y)\n",
    "predicted = mod_huber.predict(test_X)\n",
    "print('accuracy validation set: ', np.mean(predicted == test_Y))\n",
    "\n",
    "# print scores\n",
    "print(metrics.classification_report(test_Y, predicted, target_names = trainset.target_names))\n",
    "\n",
    "# get probability for first 2 sentence in our dataset\n",
    "print(trainset.data[:2])\n",
    "print(trainset.target[:2])\n",
    "print(mod_huber.predict_proba(bow[:2, :]))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
